1 package org.apache.lucene.util.automaton;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 import java.util.ArrayList;
21 import java.util.HashMap;
22 import java.util.HashSet;
23 import java.util.LinkedList;
24 import java.util.List;
25 import java.util.Map;
26 import java.util.Random;
27 import java.util.Set;
28
29 import org.apache.lucene.util.ArrayUtil;
30 import org.apache.lucene.util.IntsRef;
31 import org.apache.lucene.util.IntsRefBuilder;
32 import org.apache.lucene.util.TestUtil;
33 import org.apache.lucene.util.UnicodeUtil;
34
35
36
37
38
39
40
41
42 public class AutomatonTestUtil {
43
44
45
46 public static final int DEFAULT_MAX_DETERMINIZED_STATES = 1000000;
47
48
49 public static String randomRegexp(Random r) {
50 while (true) {
51 String regexp = randomRegexpString(r);
52
53 if (!UnicodeUtil.validUTF16String(regexp))
54 continue;
55 try {
56 new RegExp(regexp, RegExp.NONE);
57 return regexp;
58 } catch (Exception e) {}
59 }
60 }
61
62 private static String randomRegexpString(Random r) {
63 final int end = r.nextInt(20);
64 if (end == 0) {
65
66 return "";
67 }
68 final char[] buffer = new char[end];
69 for (int i = 0; i < end; i++) {
70 int t = r.nextInt(15);
71 if (0 == t && i < end - 1) {
72
73
74 buffer[i++] = (char) TestUtil.nextInt(r, 0xd800, 0xdbff);
75
76 buffer[i] = (char) TestUtil.nextInt(r, 0xdc00, 0xdfff);
77 }
78 else if (t <= 1) buffer[i] = (char) r.nextInt(0x80);
79 else if (2 == t) buffer[i] = (char) TestUtil.nextInt(r, 0x80, 0x800);
80 else if (3 == t) buffer[i] = (char) TestUtil.nextInt(r, 0x800, 0xd7ff);
81 else if (4 == t) buffer[i] = (char) TestUtil.nextInt(r, 0xe000, 0xffff);
82 else if (5 == t) buffer[i] = '.';
83 else if (6 == t) buffer[i] = '?';
84 else if (7 == t) buffer[i] = '*';
85 else if (8 == t) buffer[i] = '+';
86 else if (9 == t) buffer[i] = '(';
87 else if (10 == t) buffer[i] = ')';
88 else if (11 == t) buffer[i] = '-';
89 else if (12 == t) buffer[i] = '[';
90 else if (13 == t) buffer[i] = ']';
91 else if (14 == t) buffer[i] = '|';
92 }
93 return new String(buffer, 0, end);
94 }
95
96
97
98
99 private static int getRandomCodePoint(final Random r, int min, int max) {
100 final int code;
101 if (max < UnicodeUtil.UNI_SUR_HIGH_START ||
102 min > UnicodeUtil.UNI_SUR_HIGH_END) {
103
104 code = min+r.nextInt(max-min+1);
105 } else if (min >= UnicodeUtil.UNI_SUR_HIGH_START) {
106 if (max > UnicodeUtil.UNI_SUR_LOW_END) {
107
108 code = 1+UnicodeUtil.UNI_SUR_LOW_END+r.nextInt(max-UnicodeUtil.UNI_SUR_LOW_END);
109 } else {
110 throw new IllegalArgumentException("transition accepts only surrogates: min=" + min + " max=" + max);
111 }
112 } else if (max <= UnicodeUtil.UNI_SUR_LOW_END) {
113 if (min < UnicodeUtil.UNI_SUR_HIGH_START) {
114
115 code = min + r.nextInt(UnicodeUtil.UNI_SUR_HIGH_START - min);
116 } else {
117 throw new IllegalArgumentException("transition accepts only surrogates: min=" + min + " max=" + max);
118 }
119 } else {
120
121 int gap1 = UnicodeUtil.UNI_SUR_HIGH_START - min;
122 int gap2 = max - UnicodeUtil.UNI_SUR_LOW_END;
123 int c = r.nextInt(gap1+gap2);
124 if (c < gap1) {
125 code = min + c;
126 } else {
127 code = UnicodeUtil.UNI_SUR_LOW_END + c - gap1 + 1;
128 }
129 }
130
131 assert code >= min && code <= max && (code < UnicodeUtil.UNI_SUR_HIGH_START || code > UnicodeUtil.UNI_SUR_LOW_END):
132 "code=" + code + " min=" + min + " max=" + max;
133 return code;
134 }
135
136
137
138
139
140
141
142
143 public static class RandomAcceptedStrings {
144
145 private final Map<Transition,Boolean> leadsToAccept;
146 private final Automaton a;
147 private final Transition[][] transitions;
148
149 private static class ArrivingTransition {
150 final int from;
151 final Transition t;
152
153 public ArrivingTransition(int from, Transition t) {
154 this.from = from;
155 this.t = t;
156 }
157 }
158
159 public RandomAcceptedStrings(Automaton a) {
160 this.a = a;
161 if (a.getNumStates() == 0) {
162 throw new IllegalArgumentException("this automaton accepts nothing");
163 }
164 this.transitions = a.getSortedTransitions();
165
166 leadsToAccept = new HashMap<>();
167 final Map<Integer,List<ArrivingTransition>> allArriving = new HashMap<>();
168
169 final LinkedList<Integer> q = new LinkedList<>();
170 final Set<Integer> seen = new HashSet<>();
171
172
173
174 int numStates = a.getNumStates();
175 for(int s=0;s<numStates;s++) {
176 for(Transition t : transitions[s]) {
177 List<ArrivingTransition> tl = allArriving.get(t.dest);
178 if (tl == null) {
179 tl = new ArrayList<>();
180 allArriving.put(t.dest, tl);
181 }
182 tl.add(new ArrivingTransition(s, t));
183 }
184 if (a.isAccept(s)) {
185 q.add(s);
186 seen.add(s);
187 }
188 }
189
190
191
192 while (q.isEmpty() == false) {
193 final int s = q.removeFirst();
194 List<ArrivingTransition> arriving = allArriving.get(s);
195 if (arriving != null) {
196 for(ArrivingTransition at : arriving) {
197 final int from = at.from;
198 if (!seen.contains(from)) {
199 q.add(from);
200 seen.add(from);
201 leadsToAccept.put(at.t, Boolean.TRUE);
202 }
203 }
204 }
205 }
206 }
207
208 public int[] getRandomAcceptedString(Random r) {
209
210 final List<Integer> soFar = new ArrayList<>();
211
212 int s = 0;
213
214 while(true) {
215
216 if (a.isAccept(s)) {
217 if (a.getNumTransitions(s) == 0) {
218
219 break;
220 } else {
221 if (r.nextBoolean()) {
222 break;
223 }
224 }
225 }
226
227 if (a.getNumTransitions(s) == 0) {
228 throw new RuntimeException("this automaton has dead states");
229 }
230
231 boolean cheat = r.nextBoolean();
232
233 final Transition t;
234 if (cheat) {
235
236
237 List<Transition> toAccept = new ArrayList<>();
238 for(Transition t0 : transitions[s]) {
239 if (leadsToAccept.containsKey(t0)) {
240 toAccept.add(t0);
241 }
242 }
243 if (toAccept.size() == 0) {
244
245 t = transitions[s][r.nextInt(transitions[s].length)];
246 } else {
247 t = toAccept.get(r.nextInt(toAccept.size()));
248 }
249 } else {
250 t = transitions[s][r.nextInt(transitions[s].length)];
251 }
252 soFar.add(getRandomCodePoint(r, t.min, t.max));
253 s = t.dest;
254 }
255
256 return ArrayUtil.toIntArray(soFar);
257 }
258 }
259
260 private static Automaton randomSingleAutomaton(Random random) {
261 while (true) {
262 try {
263 Automaton a1 = new RegExp(AutomatonTestUtil.randomRegexp(random), RegExp.NONE).toAutomaton();
264 if (random.nextBoolean()) {
265 a1 = Operations.complement(a1, DEFAULT_MAX_DETERMINIZED_STATES);
266 }
267 return a1;
268 } catch (TooComplexToDeterminizeException tctde) {
269
270 }
271 }
272 }
273
274
275 public static Automaton randomAutomaton(Random random) {
276
277 Automaton a1 = randomSingleAutomaton(random);
278 Automaton a2 = randomSingleAutomaton(random);
279
280
281 switch (random.nextInt(4)) {
282 case 0: return Operations.concatenate(a1, a2);
283 case 1: return Operations.union(a1, a2);
284 case 2: return Operations.intersection(a1, a2);
285 default: return Operations.minus(a1, a2, DEFAULT_MAX_DETERMINIZED_STATES);
286 }
287 }
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326 public static Automaton minimizeSimple(Automaton a) {
327 Set<Integer> initialSet = new HashSet<Integer>();
328 a = determinizeSimple(Operations.reverse(a, initialSet), initialSet);
329 initialSet.clear();
330 a = determinizeSimple(Operations.reverse(a, initialSet), initialSet);
331 return a;
332 }
333
334
335
336
337 public static Automaton determinizeSimple(Automaton a) {
338 Set<Integer> initialset = new HashSet<>();
339 initialset.add(0);
340 return determinizeSimple(a, initialset);
341 }
342
343
344
345
346
347 public static Automaton determinizeSimple(Automaton a, Set<Integer> initialset) {
348 if (a.getNumStates() == 0) {
349 return a;
350 }
351 int[] points = a.getStartPoints();
352
353 Map<Set<Integer>, Set<Integer>> sets = new HashMap<>();
354 LinkedList<Set<Integer>> worklist = new LinkedList<>();
355 Map<Set<Integer>, Integer> newstate = new HashMap<>();
356 sets.put(initialset, initialset);
357 worklist.add(initialset);
358 Automaton.Builder result = new Automaton.Builder();
359 result.createState();
360 newstate.put(initialset, 0);
361 Transition t = new Transition();
362 while (worklist.size() > 0) {
363 Set<Integer> s = worklist.removeFirst();
364 int r = newstate.get(s);
365 for (int q : s) {
366 if (a.isAccept(q)) {
367 result.setAccept(r, true);
368 break;
369 }
370 }
371 for (int n = 0; n < points.length; n++) {
372 Set<Integer> p = new HashSet<>();
373 for (int q : s) {
374 int count = a.initTransition(q, t);
375 for(int i=0;i<count;i++) {
376 a.getNextTransition(t);
377 if (t.min <= points[n] && points[n] <= t.max) {
378 p.add(t.dest);
379 }
380 }
381 }
382
383 if (!sets.containsKey(p)) {
384 sets.put(p, p);
385 worklist.add(p);
386 newstate.put(p, result.createState());
387 }
388 int q = newstate.get(p);
389 int min = points[n];
390 int max;
391 if (n + 1 < points.length) {
392 max = points[n + 1] - 1;
393 } else {
394 max = Character.MAX_CODE_POINT;
395 }
396 result.addTransition(r, q, min, max);
397 }
398 }
399
400 return Operations.removeDeadStates(result.finish());
401 }
402
403
404
405
406
407
408
409
410
411
412
413
414
415 public static Set<IntsRef> getFiniteStringsRecursive(Automaton a, int limit) {
416 HashSet<IntsRef> strings = new HashSet<>();
417 if (!getFiniteStrings(a, 0, new HashSet<Integer>(), strings, new IntsRefBuilder(), limit)) {
418 return strings;
419 }
420 return strings;
421 }
422
423
424
425
426
427
428 private static boolean getFiniteStrings(Automaton a, int s, HashSet<Integer> pathstates,
429 HashSet<IntsRef> strings, IntsRefBuilder path, int limit) {
430 pathstates.add(s);
431 Transition t = new Transition();
432 int count = a.initTransition(s, t);
433 for (int i=0;i<count;i++) {
434 a.getNextTransition(t);
435 if (pathstates.contains(t.dest)) {
436 return false;
437 }
438 for (int n = t.min; n <= t.max; n++) {
439 path.append(n);
440 if (a.isAccept(t.dest)) {
441 strings.add(path.toIntsRef());
442 if (limit >= 0 && strings.size() > limit) {
443 return false;
444 }
445 }
446 if (!getFiniteStrings(a, t.dest, pathstates, strings, path, limit)) {
447 return false;
448 }
449 path.setLength(path.length() - 1);
450 }
451 }
452 pathstates.remove(s);
453 return true;
454 }
455
456
457
458
459
460
461
462 public static boolean isFiniteSlow(Automaton a) {
463 if (a.getNumStates() == 0) {
464 return true;
465 }
466 return isFiniteSlow(a, 0, new HashSet<Integer>());
467 }
468
469
470
471
472
473
474
475 private static boolean isFiniteSlow(Automaton a, int s, HashSet<Integer> path) {
476 path.add(s);
477 Transition t = new Transition();
478 int count = a.initTransition(s, t);
479 for (int i=0;i<count;i++) {
480 a.getNextTransition(t);
481 if (path.contains(t.dest) || !isFiniteSlow(a, t.dest, path)) {
482 return false;
483 }
484 }
485 path.remove(s);
486 return true;
487 }
488
489
490
491
492
493 public static void assertNoDetachedStates(Automaton a) {
494 Automaton a2 = Operations.removeDeadStates(a);
495 assert a.getNumStates() == a2.getNumStates() : "automaton has " + (a.getNumStates() - a2.getNumStates()) + " detached states";
496 }
497
498
499 public static boolean isDeterministicSlow(Automaton a) {
500 Transition t = new Transition();
501 int numStates = a.getNumStates();
502 for(int s=0;s<numStates;s++) {
503 int count = a.initTransition(s, t);
504 int lastMax = -1;
505 for(int i=0;i<count;i++) {
506 a.getNextTransition(t);
507 if (t.min <= lastMax) {
508 assert a.isDeterministic() == false;
509 return false;
510 }
511 lastMax = t.max;
512 }
513 }
514
515 assert a.isDeterministic() == true;
516 return true;
517 }
518
519 }